{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/train-en-ms.tar.gz\n",
    "# !tar -zxf train-en-ms.tar.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the parallel corpus: line i of left.txt is paired with line i of right.txt.\n",
    "# The encoding is pinned to UTF-8 (assumed to be the files' encoding -- confirm)\n",
    "# so that decoding does not depend on the locale of the machine running this.\n",
    "# split('\\n') is kept instead of splitlines(): splitlines() also breaks on\n",
    "# separators such as U+2028, which could put the two lists out of step.\n",
    "with open('train-en/left.txt', encoding = 'utf-8') as left_file:\n",
    "    left = left_file.read().split('\\n')\n",
    "\n",
    "with open('train-en/right.txt', encoding = 'utf-8') as right_file:\n",
    "    right = right_file.read().split('\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(3807616, 3807616)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(left), len(right)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the first 800,000 entries of each list; the same cut is applied to\n",
    "# both so that index i still refers to the same sentence pair.\n",
    "left, right = left[:800000], right[:800000]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "import re\n",
    "\n",
    "def cleaning(string):\n",
    "    \"\"\"Return `string` on a single line with normalised spacing.\n",
    "\n",
    "    Newlines, tabs and spaces are treated alike: every run of them becomes one\n",
    "    space, and whitespace at both ends is stripped.\n",
    "    \"\"\"\n",
    "    single_spaced = re.sub(r'[ \\n\\t]+', ' ', string)\n",
    "    return single_spaced.strip()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import tensorflow as tf\n",
    "import tensorflow_datasets as tfds\n",
    "from t5.data import preprocessors as prep\n",
    "import functools\n",
    "import t5\n",
    "import gin\n",
    "import sentencepiece as spm\n",
    "from glob import glob\n",
    "import os\n",
    "\n",
    "# Parse the gin operative config file.\n",
    "gin.parse_config_file('pretrained_models_base_operative_config.gin')\n",
    "# Path of the SentencePiece model; the same path is passed as\n",
    "# sentencepiece_model_path when the t5 task is registered.\n",
    "vocab = 'sp10m.cased.ms-en.model'\n",
    "sp = spm.SentencePieceProcessor()\n",
    "# Load() is the last expression of the cell, so its return value is what the cell displays.\n",
    "sp.Load(vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fail before creating the file if the two sides are misaligned, instead of\n",
    "# hitting an IndexError (or silently dropping rows) part-way through the write.\n",
    "assert len(left) == len(right), 'left/right line counts differ: %d vs %d' % (len(left), len(right))\n",
    "\n",
    "# One pair per line: the cleaned left sentence, a tab, the cleaned right sentence.\n",
    "# cleaning() replaces tabs and newlines inside each side, so the only tab and the\n",
    "# only newline in a row are the delimiters written here.\n",
    "with tf.io.gfile.GFile('en-ms.tsv', \"w\") as outfile:\n",
    "    for source_text, target_text in zip(left, right):\n",
    "        outfile.write(\"%s\\t%s\\n\" % (cleaning(source_text), cleaning(target_text)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def en_ms_dataset(split, shuffle_files = False):\n",
    "    \"\"\"Dataset function for the t5 task: stream sentence pairs from 'en-ms.tsv'.\n",
    "\n",
    "    `split` is never read and `shuffle_files` is discarded, so every call reads\n",
    "    the same single file. Each line is split on the tab into two string columns\n",
    "    (quote characters are treated as ordinary text), and each element of the\n",
    "    returned dataset is a dict with the keys 'question' and 'answer'.\n",
    "    \"\"\"\n",
    "    del shuffle_files\n",
    "\n",
    "    parse_row = functools.partial(\n",
    "        tf.io.decode_csv,\n",
    "        record_defaults = ['', ''],\n",
    "        field_delim = '\\t',\n",
    "        use_quote_delim = False,\n",
    "    )\n",
    "\n",
    "    def name_columns(*columns):\n",
    "        # First column -> 'question', second column -> 'answer'.\n",
    "        return dict(zip(['question', 'answer'], columns))\n",
    "\n",
    "    rows = tf.data.TextLineDataset(['en-ms.tsv'])\n",
    "    pairs = rows.map(parse_row, num_parallel_calls = tf.data.experimental.AUTOTUNE)\n",
    "    return pairs.map(name_columns)\n",
    "\n",
    "def en_ms_preprocessor(ds):\n",
    "    \"\"\"Turn {'question', 'answer'} examples into {'inputs', 'targets'} examples.\n",
    "\n",
    "    'inputs' is the question with the task prefix 'terjemah Inggeris ke Melayu: '\n",
    "    put in front of it; 'targets' is the answer, unchanged.\n",
    "    \"\"\"\n",
    "    task_prefix = 'terjemah Inggeris ke Melayu: '\n",
    "\n",
    "    def add_prefix(example):\n",
    "        prefixed = tf.strings.join([task_prefix, example['question']])\n",
    "        return {'inputs': prefixed, 'targets': example['answer']}\n",
    "\n",
    "    return ds.map(add_prefix, num_parallel_calls = tf.data.experimental.AUTOTUNE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop any task already registered under this name before adding it again.\n",
    "# NOTE(review): presumably add() would reject a duplicate name on re-running\n",
    "# this cell otherwise -- confirm against the installed t5 version.\n",
    "t5.data.TaskRegistry.remove('en_ms_dataset')\n",
    "t5.data.TaskRegistry.add(\n",
    "    'en_ms_dataset',\n",
    "    dataset_fn = en_ms_dataset,\n",
    "    # Only 'train' is declared; en_ms_dataset reads the same file whatever the split.\n",
    "    splits = ['train'],\n",
    "    text_preprocessor = [en_ms_preprocessor],\n",
    "    # Same SentencePiece model file that was loaded into `sp`.\n",
    "    sentencepiece_model_path = vocab,\n",
    "    metric_fns = [t5.evaluation.metrics.accuracy],\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "nq_task = t5.data.TaskRegistry.get(\"en_ms_dataset\")\n",
    "# 'train' is the only split the task declares. en_ms_dataset ignores `split`,\n",
    "# so the data read is the same single TSV file either way.\n",
    "ds = nq_task.get_dataset(split='train', sequence_length={\"inputs\": 1024, \"targets\": 1024})\n",
    "# iter() makes `r` an iterator whether as_numpy returns a generator or a plain\n",
    "# iterable, so next(r) can be called on it.\n",
    "r = iter(tfds.as_numpy(ds))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'inputs_plaintext': b'terjemah Inggeris ke Melayu: Lurie says the current real estate dip may last longer than previous lulls, but she says the numbers were worse in 2009. Homes priced under $500,000 are still selling relatively well. Lurie says it\\'s the luxury homes and apartments that are suffering. New listings in the $600,000-plus category are on the rise, but sales are down from last year. CREB says that price range represented about 18 per cent of sales compared with 20 per cent last year. \"With more options in the higher-end of the market, sellers will need to consider their competition as well as their goals regarding a sell date,\" said CREB president Corinne Lyall in a release. \"This will influence the pricing strategy they agree upon with their real estate professional\". Justin Bobier, with Crystal Creek Homes, says his company\\'s luxury home sales are down 50 per cent this year. He says a lot of buyers are playing the waiting game. \"Reality is we\\'re not going to see a price drop in new home sales this year,\" he said.',\n",
       " 'inputs': array([   13, 26087,  2040,    55,  1550,    31,  1599,  5283,   225,\n",
       "           15,  1523,   911,  3711,    24,   453,   366,   239,  2100,\n",
       "          149,  2747,    13,   909,   415,    16,    14,    96,   148,\n",
       "          225,    15,  3044,   107,  4434,    23,  1175,     3,  3099,\n",
       "           16,    13, 13203,   360, 25970,    54,   341,  4203,  6190,\n",
       "          316,     3,  1599,  5283,   225,    43,    12,    16,    15,\n",
       "         7091,  3657,    20,  5464,    16,    27,    54,  8352,     3,\n",
       "          141, 14563,    16,    23,    15,   129, 19138,     7, 13445,\n",
       "         7364,    54,    32,    15,  3027,    14,    96,  1444,    54,\n",
       "          384,    58,   239,   191,     3,   389,  3975,   347,   225,\n",
       "           27,  1271,  3530,  7510,   106,   375,   344,  3325,    18,\n",
       "         1444,  2568,    40,   361,   344,  3325,   239,   191,     3,\n",
       "           13,     6,  9439,    99,  5249,    23,    15,  1705,     7,\n",
       "         2372,    18,    15,   583,    14, 19598,    82,   446,    19,\n",
       "         3487,    92,  3081,    44,   316,    44,    92,  5041,  7954,\n",
       "           21,  2693,  2379,    14,     6,    49,   389,  3975,   347,\n",
       "          678,  3656,   153,   712,  7639,  3238,    23,    21,  2170,\n",
       "            3,    13,     6,  1310,    82,  4159,    15, 14591,  3753,\n",
       "           88,  4360,  4828,    40,    92,   911,  3711,  3651,     6,\n",
       "            3,  4665,  3361,  3065,    14,    40, 16791, 14481,  3099,\n",
       "           16,    14,   225,    68,   301,    12,    16,  7091,   435,\n",
       "         1444,    54,   384,   790,   344,  3325,    74,   191,     3,\n",
       "          135,   225,    21,   689,    18,  8068,    54,  2808,    15,\n",
       "         5038,   896,     3,    13,     6, 19572,  1052,    26,   111,\n",
       "           12,    94,    69,   399,    19,   426,    21,  1271,  4007,\n",
       "           23,   178,   435,  1444,    74,   191,    14,     6,    57,\n",
       "           49,     3,     1]),\n",
       " 'targets_plaintext': b'Lurie mengatakan penurunan harta tanah ketika ini mungkin bertahan lebih lama daripada ketenangan sebelumnya, tetapi dia mengatakan jumlahnya lebih buruk pada tahun 2009. Rumah yang berharga di bawah $ 500,000 masih dijual dengan agak baik. Lurie mengatakan ia adalah rumah dan pangsapuri mewah yang menderita. Penyenaraian baru dalam kategori $ 600,000 ditambah semakin meningkat, tetapi penjualan menurun berbanding tahun lalu. CREB mengatakan bahawa julat harga mewakili sekitar 18 peratus penjualan berbanding 20 peratus tahun lalu. \"Dengan lebih banyak pilihan di pasaran yang lebih tinggi, penjual perlu mempertimbangkan persaingan mereka dan juga tujuan mereka mengenai tarikh penjualan,\" kata presiden CREB, Corinne Lyall dalam siarannya. \"Ini akan mempengaruhi strategi penentuan harga yang mereka setujui dengan profesional harta tanah mereka\". Justin Bobier, dengan Crystal Creek Homes, mengatakan penjualan rumah mewah syarikatnya turun 50 peratus tahun ini. Dia mengatakan bahawa banyak pembeli bermain permainan menunggu. \"Kenyataannya adalah kita tidak akan melihat penurunan harga penjualan rumah baru tahun ini,\" katanya.',\n",
       " 'targets': array([ 1599,  5283,   101,  2243,  1762,   622,   123,    34,   167,\n",
       "         2946,    65,   461,   109, 18516,   843,    14,   113,    61,\n",
       "          101, 10264,    65,  1176,    33,    53,  1175,     3,  2472,\n",
       "           17,  3422,    24,   292,   129, 13557,   274,  2891,    28,\n",
       "         2302,   187,     3,  1599,  5283,   101,   171,    52,   233,\n",
       "           22,  8429,  4090,    17,  5905,     3, 10866,   476,  5345,\n",
       "           47,   143,    36,  4871,   129,    13, 19138,  6995,   781,\n",
       "          543,    14,   113,  1280,  4450,  1325,    53,   186,     3,\n",
       "          389,  3975,   347,   101,    56, 20317,   431,  2860,   462,\n",
       "          375,   394,  1280,  1325,   361,   394,    53,   186,     3,\n",
       "           13,     6,  8996,    65,   108,   507,    24,   635,    17,\n",
       "           65,   329,    14, 10454,   315,    13,  3300,  5563,    46,\n",
       "           22,    93,  2143,    46,   137,  3123,  1280,    14,     6,\n",
       "           98,   405,   389,  3975,   347,    14,  3656,   153,   712,\n",
       "         7639,  3238,    36,  5020,    38,     3,    13,     6,   886,\n",
       "           45,  3175,  2453, 20064,   431,    17,    46,  5073,    91,\n",
       "           28,  2801,  1762,   622,    46,     6,     3,  4665,  3361,\n",
       "         3065,    14,    28, 16791, 14481,  3099,    16,    14,   101,\n",
       "         1280,   233,  4090, 12541,  1088,   790,   394,    53,    34,\n",
       "            3,   160,   101,    56,   108,  4356,   964,   667,  2160,\n",
       "            3,    13,     6, 31563,    38,    52,   115,    30,    45,\n",
       "          244,  2243,   431,  1280,   233,   143,    53,    34,    14,\n",
       "            6,   194,     3,     1])}"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "next(r)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "!rm -rf train-en"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
